In [1]:
import csv
import nltk
import math
import collections
from textblob import TextBlob
from pprint import pprint
In [2]:
csvfile = open('bernie-sanders-announces.csv','r')
reader = csv.reader(csvfile)
data = []
for line in reader:
    line[3] = line[3].decode('utf-8')  # comment text column: decode from UTF-8 bytes
    data.append(line)
In [3]:
len(data)
Out[3]:
In [4]:
data[0]
Out[4]:
In [5]:
data[1]
Out[5]:
In [6]:
comment_text = data[1][-1]
In [7]:
comment_text
Out[7]:
In [8]:
comment_text[0]
Out[8]:
In [9]:
comment_text[2:6]
Out[9]:
In [10]:
comment_text + comment_text
Out[10]:
In [11]:
# tab complete
comment_text.split()
Out[11]:
In [12]:
split_on_questions = comment_text.split('?')
split_on_questions
Out[12]:
In [13]:
for string in split_on_questions:
    print string.strip()
In [14]:
cleaned = [s.strip().lower() for s in split_on_questions]
cleaned
Out[14]:
In [15]:
'?!?! '.join(cleaned)
Out[15]:
In [16]:
'Hilary' in data[80][-1]
Out[16]:
In [17]:
clinton_count = 0
for row in data:
    if 'Hilary' in row[-1] or 'Clinton' in row[-1]:
        clinton_count += 1
clinton_count
Out[17]:
In [18]:
blob = TextBlob(data[80][-1])
blob
Out[18]:
In [19]:
blob.sentences
Out[19]:
In [20]:
blob.words
Out[20]:
In [21]:
blob.tokens
Out[21]:
In [22]:
blob.noun_phrases
Out[22]:
In [23]:
blob.word_counts
Out[23]:
In [27]:
word_count = collections.Counter(blob.word_counts)
In [28]:
word_count.most_common(5)
Out[28]:
In [29]:
stopwords = nltk.corpus.stopwords.words('english')
In [30]:
nltk.download()  # opens the NLTK downloader so corpora like the stopwords list can be fetched
Out[30]:
In [31]:
# remove stopwords and very short words from the counts
for key in word_count.keys():
    if key in stopwords or len(key) <= 2:
        del word_count[key]
In [32]:
word_count.most_common(5)
Out[32]:
We could keep adding stopwords to try to make these keywords better, but it's kind of like playing whack-a-mole.
An additional solution to the problem: add a new term to our "representative-ness" measure that accounts for the overall rarity of the word across the whole corpus.
$$\frac{n_w}{N}$$
where $n_w$ is the number of documents containing word $w$, and $N$ is the total number of documents.
But we want a potential keyword to have a lower score if it is common in the corpus and a higher score if it is rarer, so we flip it:
$$\frac{N}{n_w}$$
It's also common to take the log of this to reduce the amount of disparity between extremely common and extremely uncommon terms:
$$\log\frac{N}{n_w}$$
This is called IDF, or Inverse Document Frequency. Let's calculate it for all the words in our comment dataset!
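Before running this on the real comments, here is a minimal sketch of the idea using a made-up three-document corpus (toy_docs is invented purely for illustration):
toy_docs = ['bernie announces campaign',
            'bernie talks about wall street',
            'cats are great']
toy_N = float(len(toy_docs))
toy_counts = collections.Counter()
for doc in toy_docs:
    toy_counts.update(set(doc.split()))   # count each word at most once per document
for word, n_w in toy_counts.items():
    print word, math.log(toy_N/n_w)
# 'bernie' appears in 2 of 3 documents -> log(3/2) ~ 0.41 (common, low score)
# 'cats' appears in 1 of 3 documents   -> log(3/1) ~ 1.10 (rare, high score)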
In [33]:
N_documents = float(len(data))
word_document_counts = collections.Counter()  # how many documents each word appears in
word_idf = {}  # will hold the IDF score for each word
In [34]:
# count, for each word, how many comments it appears in (skipping the header row)
for row in data[1:]:
    blob = TextBlob(row[-1].lower())
    words = blob.word_counts.keys()
    word_document_counts.update(words)
In [35]:
# IDF: log of (total documents / documents containing the word)
for key, val in word_document_counts.iteritems():
    word_idf[key] = math.log(N_documents/val)
For each word $w$ in a given document $D$, we can take the term frequency
$$\frac{D_w}{W_D}$$
where $D_w$ is the number of occurrences of word $w$ in document $D$ and $W_D$ is the total number of words in document $D$,
and multiply it by the word's IDF that we just calculated. This gives TF-IDF scores, and the words with the highest scores are likely to be good representatives of that document.
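For example, with made-up numbers: a word that appears 3 times in a 60-word comment and shows up in 50 of 1,000 comments overall would score
$$\frac{3}{60}\cdot\log\frac{1000}{50} = 0.05\cdot\log 20 \approx 0.15$$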
In [36]:
comment = data[80][-1]
blob = TextBlob(comment.lower())
num_words_in_comment = len(blob.words)
word_count = blob.word_counts
tf_scores = {}
# term frequency: each word's count divided by the total number of words in the comment
for word, count in word_count.items():
    if word not in stopwords and len(word) > 2:
        tf_scores[word] = float(count)/num_words_in_comment
In [37]:
tf_idf = {}
# multiply each word's term frequency by its IDF, then look at the top 5 scores
for word, tf in tf_scores.items():
    tf_idf[word] = tf*word_idf[word]
sorted(tf_idf.iteritems(), key=lambda k: k[1], reverse=True)[:5]
Out[37]:
Note that TF-IDF can be tweaked in lots of other ways if you aren't getting good results.
It can also be done with "n-grams": phrases that are n words long, which lets you capture multi-word phrases like "gay rights" or "hillary clinton".
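TextBlob can produce n-grams directly with its ngrams() method. As a rough sketch (the sentence below is made up), the document-frequency counting and scoring would then work the same way as before, just keyed on the bigram instead of the single word:
bigram_blob = TextBlob('hillary clinton supports gay rights')
for bigram in bigram_blob.ngrams(n=2):
    print ' '.join(bigram)   # hillary clinton, clinton supports, supports gay, gay rights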
In [38]:
from nltk.stem.porter import PorterStemmer
In [39]:
stemmer = PorterStemmer()
# the Porter stemmer chops related words down to a common root form
print stemmer.stem('political')
print stemmer.stem('politics')
print stemmer.stem('politician')
In [40]:
from nltk.text import Text
tokens = TextBlob(data[80][-1]).tokens
text_object = Text(tokens)
text_object.concordance('Hilary')
In [87]:
blob = TextBlob(data[41][-1])
blob
Out[87]:
In [88]:
blob.sentiment
Out[88]:
In [90]:
blob.sentences[1].sentiment
Out[90]: